// =============================================================
// FORTH-Processor K1 - based on J1
//
// Author   : Klaus Kohl-Schoepe
// Date     : 11.05.2020
// File Name: K1.v
// Copyright (C) 2020 Klaus Kohl-Schoepe (kks@designin.de)
// =============================================================

// =============================================================
// K1 Opcode:
//
// 15 14 13 12 11 10  9  8  7  6  5  4  3  2  1  0 = Opcode (16 Bit aligned)
// Literal:
//  0  x  x  x  x  x  x  x  x  x  x  x  x  x  x  x = Lit (bit 14 = bit 15 => -16384 ... +16383)
// Branches:
//  1  0  x  x  x  x  x  x  x  x  x  x  x  x  x  x = CALL (absolute: x * 2)
//  |  1  0  0  x  x  x  x  x  x  x  x  x  x  x  x = BRANCH (relative: ±x * 2)
//  |  |  |  1  x  x  x  x  x  x  x  x  x  x  x  x = 0BRANCH (relative: ±x * 2)
//  |  |  1  0  x  x  x  x  x  x  x  x  x  x  x  x = NEXT (relative: ±x * 2 if R != 0 else RDROP)
// Memory and Alu:
//  1  1  1  1  0  ;  >R >N -- ALU  --  BW @! s  s = Memory Read/Write (@!=1: Write)
//  1  1  1  1  1  ;  >R >N -- ALU  --  r  r  s  s = Alu
//  ; and >R:
//      - Memory   0  0                    0 = @: rstack and stack unchanged
//                 0  0                    1 = !: Stack -1
//                 0  1                    0 = @: T -> R 
//                 1  0 = R -> PC (Exit)
//                 1  1 = Execute (PC+2 -> R; T -> PC)
//      - ALU:     0  0 = R unchanged
//                 0  1 = T -> R
//                 1  0 = R -> PC (Exit)
//                 1  1                 0  0 = Goto (T -> PC)
//                 1  1                 0  1 = Execute (PC+2 -> R; T -> PC)
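//                 1  1                 1  1 = RDROP, Goto
//                 1  1                 1  1 = RDROP, Execute (NOTE: same encoding as the line above; the logic in this file implements only RDROP + Goto for rr = %11)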
//  >N:                  0 = N unchanged
//                       1 = T -> N
//  BW:                                 0 = Byte (TOS[0])
//                                      1 = Word
//  @!:                                    0 = Read
//                                         1 = Write
//  addition:                           1  1       = exit if rr = %11
//                                      0  1       = T > R if rr = %01
//  ALU (4-Bit):            0  0  0  0  |  |  |  | = T        (special: R)
//                          |  |  |  1  |  |  |  | = N        (special: RA = R with bit 0=0)
//                          |  |  1  0  |  |  |  | = +        (special: T+1)
//                          |  |  |  1  |  |  |  | = -        (special: T-1)
//                          |  1  0  0  |  |  |  | = AND      (special: T+2)
//                          |  |  |  1  |  |  |  | = OR       (special: T-2)
//                          |  |  1  0  |  |  |  | = XOR      (special: Carry - 0 or 1)
//                          |  |  |  1  |  |  |  | = INVERT   (special: INV14 = T xor $4000)
//                          1  0  0  0  |  |  |  | = ASHIFT   (nos >> tos[3:0] => tos or special: tos >> 1 - signed)
//                          |  |  |  1  |  |  |  | = RSHIFT   (nos >> tos[3:0] => tos or special: tos >> 1 - unsigned)
//                          |  |  1  0  |  |  |  | = LSHIFT   (nos << tos[3:0] => tos or special: tos << 1)
//                          |  |  |  1  |  |  |  | = 0=       (special: 0<)
//                          |  1  0  0  |  |  |  | = U<       (< unsigned or special: dsp)
//                          |  |  |  1  |  |  |  | = <        (< signed   or special: rsp)
//                          |  |  1  0  |  |  |  | = Divstep  (if T<R then T:N << 1 + 0 else (T-R:N) << 1 + 1) - not with memory 
//                          |  |  |  1  |  |  |  | = *        (special: UM* = T * N => T:N )
//  Stacks:                             r  r  |  | = Return stack changes (-1 .. +1)
//                                      |  |  s  s = Data stack changes (-1 .. +1; -2 is special)
//                                      Special: no stack changes if >N: not used
//
// During a read from memory the ALU uses the memory data instead of T
// If ss == 2'b10 (-2) then the special function is selected - push T if >N (insn[8])
// The carry flag is modified by +, - and the shift operations
// =============================================================

// =============================================================
// Default parameter (only 16-bit data and addresses tested)
// =============================================================

// Disable implicit net declarations so that every net must be declared explicitly
`default_nettype none
`define MEMADDR 16      // Address width: 32K bytes program memory + 32K I/O
`define WIDTH 16        // Data width: 16 bit
`define DEPTH 8         // 2^8 = 256 entries per stack (512 for both stacks in one 9Kb memory)
`define SPTRS 9         // Stack pointer width: one bit more than DEPTH to recognize overflow

// Alternative (disabled) time scale:
//`timescale 100 ns / 10 ns
`timescale 1 ns / 1 ps

// =============================================================
// Module K1
// =============================================================

// K1 processor core.
//
// Decodes one 16-bit opcode per clock (see the opcode map at the top of this
// file) and computes the next TOS, NOS, TOR, stack pointers and PC in a single
// combinational block; the state is registered on the rising clock edge.
//
//  - resetq is an asynchronous, active-low reset. After reset the core stays
//    in "reboot" for one further clock with PC = 0 before executing opcodes.
//  - Bit 0 of the PC register holds the carry flag (opcodes are word aligned,
//    so code_addr always drives bit 0 as 0).
//  - NOS and TOR are not registers inside this module: they are read from the
//    external dual-port stack memory "myStacks" (port A = data stack,
//    port B = return stack), which is addressed with the NEXT stack pointers.
//    NOTE(review): this presumes myStacks is a synchronous RAM - confirm.
module K1(
  input  wire clk,
  input  wire resetq,
  // Port A is only used to fetch code (bit 0 always 0, word size)
  output wire [`MEMADDR-1:0] code_addr, // Program address (Port A - words)
  input  wire [`WIDTH-1:0] insn,        // Program data (Port A)
  // Port B is used for memory and I/O read/write
  output wire [`MEMADDR-1:0] mem_addr,  // Data address (Port B)
  output wire mem_bw,                   // Byte (0) or word (1) access
  output wire [`WIDTH-1:0] dout,        // Write data for memory and I/O
  input  wire [`WIDTH-1:0] din,         // Read data from memory (Port B) or I/O
  output wire mem_rd,                   // Read from memory
  output wire mem_wr                    // Write to memory
);

  reg reboot = 1'b1;                    // Avoid executing wrong code at startup

  // Registers
  reg  [`WIDTH-1:0] pc, pcN;            // Program pointer (bit 0 is used for carry)
  reg  carry;                           // Next carry

  reg  [`WIDTH-1:0] tos, tosN;          // Top of data stack (NOS comes from the stack memory)
  reg  [`SPTRS-1:0] dsp, dspN;          // Pointer for the rest of the data stack
  reg  dspW;                            // Flag for data stack write
  wire [`WIDTH-1:0] nos;                // Next of data stack is in memory
  reg  [`WIDTH-1:0] nosN;

  reg  [`SPTRS-1:0] rsp, rspN;          // Pointer for return stack
  reg  rspW;                            // Flag for return stack write
  wire [`WIDTH-1:0] tor;                // Top of return stack is in memory
  reg  [`WIDTH-1:0] torN;

  // Opcode class flags
  wire is_literal = !insn[15];
  wire is_call    = (insn[15:14] == 2'b10);
  wire is_0branch = (insn[15:12] == 4'b1101);
  wire is_next    = (insn[15:12] == 4'b1110);
  wire is_mem     = (insn[15:11] == 5'b1111_0);
  wire is_alu     = (insn[15:11] == 5'b1111_1);
  wire is_special = is_alu & (insn[ 1: 0] == 2'b10); // used for alternative ALU operation
  // Special ALU opcode 14 loads a stack pointer from TOS: bit WIDTH-1 of TOS
  // selects the data stack pointer, bit WIDTH-2 the return stack pointer.
  wire is_dstack  = is_alu & is_special & (insn[7:4] == 4'b1110) & (tos[`WIDTH-1]); // stack changes
  wire is_rstack  = is_alu & is_special & (insn[7:4] == 4'b1110) & (tos[`WIDTH-2]); // stack changes

  // Support functions
  wire [  `WIDTH-1:0] pc_plus_2   = pc + 2'b10;                    // Next opcode address (+2) with old carry
  wire [  `WIDTH-1:0] pc_rel      = pc_plus_2 + {{(`WIDTH - 13){insn[11]}}, insn[11:0], 1'b0};
  wire [  `WIDTH-1:0] tor_minus_1 = tor - 16'd1;                   // R stack minus 1 for NEXT
  wire [  `WIDTH-1:0] tosX        = (is_mem & mem_rd) ? din : tos; // second operand: TOS, or memory data on a read
  wire [2*`WIDTH-1:0] multiplyer  = tos * nos;                     // for UM* = TOS * NOS
  wire [  `WIDTH  :0] dstos       = {tos, nos[`WIDTH-1]} - {1'b0, tor};    // for division step: tos:nos[`WIDTH-1] - tor

  // Assignments for memory access
  assign code_addr = reboot ? {(`WIDTH){1'b0}} : {{(`MEMADDR-`WIDTH){1'b0}}, pcN[`WIDTH-1:1], 1'b0}; // Next code address
  assign mem_addr  = {{(`MEMADDR-`WIDTH){1'b0}}, (mem_wr ? tos : tosN)}; // next address, or actual address on write
  assign dout      = nos;                                                // NOS is also used as data for memory write
  assign mem_bw    = insn[3];                                            // 0=byte, 1=word
  assign mem_wr    = !reboot & is_mem &  insn[2];                     // write signal to memory
  assign mem_rd    = !reboot & is_mem & !insn[2];                     // read signal from memory

  // The D and R stacks
  reg [1:0] dspI, rspI;                 // Signed stack pointer increments (-2 .. +1)
// Old version
//  stack #(.WIDTH(`WIDTH), .DEPTH(2 ** `DEPTH)) dstack(.clk(clk), .rd(nos), .we(dspW), .wd(nosN), .delta(dspI));
//  stack #(.WIDTH(`WIDTH), .DEPTH(2 ** `DEPTH)) rstack(.clk(clk), .rd(tor), .we(rspW), .wd(torN), .delta(rspI));
// New version with dual port memory
  myStacks stacks(
    .address_a({1'b0, dspN[`DEPTH-1:0]}),  // Port A: data stack   - 256 entries from 0 ... 255
    .address_b({1'b1, rspN[`DEPTH-1:0]}),  // Port B: return stack - 256 entries from 256 ... 511
    .clock(clk),
    .data_a(nosN),                         // next NOS if write
    .data_b(torN),                         // next TOR if write
    .wren_a(dspW),                         // Write next NOS
    .wren_b(rspW),                         // Write next TOR
    .q_a(nos),                             // actual NOS
    .q_b(tor));                            // actual TOR

  // Main part: calculating next values
  always @* begin
    // Next TOS (and next carry)
    if(is_alu | is_mem) begin
      if(!is_special) begin // Standard ALU
        casez (insn[7:4])
          4'b0000: {carry, tosN} = {pc[0], tosX};                                     // Tx
          4'b0001: {carry, tosN} = {pc[0], nos};                                      // N
          4'b0010: {carry, tosN} = {1'b0, nos} + {1'b0, tosX};                        // N + Tx
          4'b0011: {carry, tosN} = {1'b0, nos} - {1'b0, tosX};                        // N - Tx
          4'b0100: {carry, tosN} = {pc[0], (nos & tosX)};                             // N and Tx
          4'b0101: {carry, tosN} = {pc[0], (nos | tosX)};                             // N or Tx
          4'b0110: {carry, tosN} = {pc[0], (nos ^ tosX)};                             // N xor Tx
          4'b0111: {carry, tosN} = {pc[0], ~tosX};                                    // Tx xor $ffff
          // Arithmetic shift: the shift count can be up to 15, so the sign must be
          // extended by WIDTH-1 bits; a shorter extension lets the logical ">>"
          // shift zeros into the result for larger counts.
          4'b1000: {tosN, carry} = {{(`WIDTH-1){nos[`WIDTH - 1]}}, nos, 1'b0} >> tosX[3:0]; // N >> Tx (signed)
          4'b1001: {tosN, carry} = {4'b0                , nos, 1'b0} >> tosX[3:0];    // N >> Tx (unsigned)
          4'b1010: {carry, tosN} = {4'b0                , nos      } << tosX[3:0];    // N << Tx
          4'b1011: {carry, tosN} = {pc[0], {`WIDTH{!tosX}}};                        // T = 0 ?
          4'b1100: {carry, tosN} = {pc[0], {`WIDTH{(nos < tosX)}}};                   // N < T (unsigned) ?
          4'b1101: {carry, tosN} = {pc[0], {`WIDTH{($signed(nos) < $signed(tosX))}}}; // N < T (signed) ?
          4'b1110: {carry, tosN} = dstos[`WIDTH] ? {pc[0], tos[`WIDTH-2:0], nos[`WIDTH-1]} : {pc[0], dstos[`WIDTH-1:0]}; // Divstep
          4'b1111: {carry, tosN} = {pc[0], multiplyer[(`WIDTH-1):0]};                 // N * T (low part)
        endcase
      end else begin // Special ALU
        casez (insn[7:4])
          4'b0000: {carry, tosN} = {pc[0], tor                  };                    // R
          4'b0001: {carry, tosN} = {pc[0], tor[`WIDTH-1:1], 1'b0};                    // RA = R with bit0=0
          4'b0010: {carry, tosN} = {1'b0, tosX} + {17'b01};                           // T+1
          4'b0011: {carry, tosN} = {1'b0, tosX} - {17'b01};                           // T-1
          4'b0100: {carry, tosN} = {1'b0, tosX} + {17'b10};                           // T+2
          4'b0101: {carry, tosN} = {1'b0, tosX} - {17'b10};                           // T-2
          4'b0110: {carry, tosN} = {pc[0], {(`WIDTH-1){1'b0}}, pc[0]};                // Carry
          4'b0111: {carry, tosN} = {pc[0], tosX ^ 16'h4000};                          // INV14
          4'b1000: {tosN, carry} = {tos[`WIDTH-1], tosX};                             // T >> 1 (signed)
          4'b1001: {tosN, carry} = {1'b0         , tosX};                             // T >> 1 (unsigned)
          4'b1010: {carry, tosN} = tosX << 1;                                         // T << 1
          4'b1011: {carry, tosN} = {pc[0], {`WIDTH{tosX[`WIDTH-1]}}};                 // T < 0 ?
          4'b1100: {carry, tosN} = {pc[0], {(`WIDTH - `SPTRS){dsp[`SPTRS-1]}}, dsp};  // DEPTH
          4'b1101: {carry, tosN} = {pc[0], {(`WIDTH - `SPTRS){rsp[`SPTRS-1]}}, rsp};  // RDEPTH
          4'b1110: {carry, tosN} = {pc[0], nos};                                      // N - used for stack changes
          4'b1111: {carry, tosN} = {pc[0], multiplyer[(2*`WIDTH-1):`WIDTH]};          // UM*
        endcase
      end
    end else begin
      if (is_literal)
        {carry, tosN} = {pc[0], {(`WIDTH - 15){insn[14]}}, insn[14:0]}; // Literal
      else
        {carry, tosN} = is_0branch ? {pc[0], nos} : {pc[0], tos};       // default: no changes on TOS except at 0BRANCH
    end

    // Next NOS
    if (is_mem | is_alu)     begin // Memory and ALU
      casez ({is_mem, is_alu, insn[8], is_special}) // insn[8] is >N
        4'b1_0_0_?: {dspW, dspI, nosN} = {1'b0, insn[1:0], nos};  // Memory read/write: only stack changes
        4'b1_0_1_?: {dspW, dspI, nosN} = {1'b1, insn[1:0], tos};  // Memory read/write with >N: stack + 1
        4'b0_1_0_0: {dspW, dspI, nosN} = (insn[7:4] == 4'b1110) ? // ALU opcode 14 - division step: nos << 1 + !dstos[`WIDTH]
                                         {1'b1, 2'b00    , nos[`WIDTH-2:0], !dstos[`WIDTH]} :
                                         {1'b0, insn[1:0], nos};  // else ALU and not T>N
        4'b0_1_0_1: {dspW, dspI, nosN} = (insn[7:4] == 4'b1111) ? // ALU opcode 15 - UM*: low part of multiplier
                                         {1'b1, 2'b00    , multiplyer[`WIDTH-1:0]} :
                                         {1'b0, 2'b00    , nos};  // special and not T>N (keep N)
        4'b0_1_1_0: {dspW, dspI, nosN} = {1'b1, insn[1:0], tos};  // ALU and T>N (dup T - change stack)
        4'b0_1_1_1: {dspW, dspI, nosN} = {1'b1, 2'b01    , tos};  // special and T>N (dup T)
        default:    {dspW, dspI, nosN} = {1'b0, 2'b00    , nos};  // Default: no changes on NOS
      endcase
    end else if (is_literal) begin // Literal ?
                    {dspW, dspI, nosN} = {1'b1, 2'b01    , tos};  // Literal: T to N
    end else if (is_0branch) begin // 0Branch
                    {dspW, dspI, nosN} = {1'b0, 2'b11    , nos};  // 0BRANCH: drop T
    end else                 begin // Default
                    {dspW, dspI, nosN} = {1'b0, 2'b00    , nos};  // no changes on NOS
    end

    // Next data stack pointer - maybe stack changes
    if (is_dstack)
      dspN = tos[`SPTRS-1:0]; // set new stack pointer from TOS (drop required)
    else
      dspN = dsp + {{(`SPTRS-1){dspI[1]}}, dspI[0]}; // add the sign-extended increment

    // Next TOR
    if (is_call) begin                   // Call
      {rspW, rspI, torN} = {1'b1, 2'b01, pc_plus_2};
    end else if (is_next) begin          // NEXT: R-1 or RDROP
      {rspW, rspI, torN} = |tor ? {1'b1, 2'b00, tor_minus_1} : {1'b0, 2'b11, tor};
    end else if (is_mem | is_alu) begin  // Memory and ALU
      casez ({is_alu, is_mem, insn[10:9]})
        4'b0_1_00: {rspW, rspI, torN} = {1'b0, 2'b00    , tor};        // Memory alone: no changes
        4'b0_1_01: {rspW, rspI, torN} = {1'b1, 2'b01    , tos};        // Memory with >R: T to R
        4'b0_1_10: {rspW, rspI, torN} = {1'b0, 2'b11    , tor};        // Memory with EXIT: R to PC
        4'b0_1_11: {rspW, rspI, torN} = {1'b0, 2'b00    , tor};        // Memory with R> and EXIT: PERFORM
        4'b1_0_?0: {rspW, rspI, torN} = {1'b0, insn[3:2], tor};        // ALU
        4'b1_0_01: {rspW, rspI, torN} = {1'b1, insn[3:2], tos};        // ALU with T>R
        4'b1_0_11: {rspW, rspI, torN} = (insn[3:2] == 2'b01) ?         // ALU with exit and T>R
                                        {1'b1, insn[3:2], pc_plus_2} : // execute
                                        {1'b0, insn[3:2], tor};        // goto
        default:   {rspW, rspI, torN} = {1'b0, 2'b00    , tor};        // Branch, 0Branch, Literal
      endcase
    end else begin                       // Branch, 0Branch, Literal
      {rspW, rspI, torN} = {1'b0, 2'b00, tor};
    end

    // Next return stack pointer - maybe stack changes
    if (is_rstack)
      rspN = tos[`SPTRS-1:0]; // set new return stack pointer from TOS (drop required)
    else
      rspN = rsp + {{(`SPTRS-1){rspI[1]}}, rspI[0]}; // add the sign-extended increment

    // Next PC
    if (is_literal) begin                            // Literal ?
      pcN = pc_plus_2;                               // Literal: PC + 2
    end else if (is_call) begin                      // Call ?
      pcN = {1'b0, insn[13:0], pc[0]};               // CALL: x*2 (absolute), carry kept in bit 0
    end else begin
      casez ({insn[13:12], insn[10:9]})
        4'b00_??:   pcN = pc_rel;                    // BRANCH pc + x*2
        4'b01_??:   pcN = |tos ? pc_plus_2 : pc_rel; // 0BRANCH: pc ± x*2 if T=0
        4'b10_??:   pcN = |tor ? pc_rel : pc_plus_2; // NEXT: pc ± x*2
        4'b11_10:   pcN = tor;                       // Memory/ALU with EXIT: pc = R
        4'b11_11:   pcN = {tos[`WIDTH-1:0]};         // Memory/ALU with >R EXIT: pc = tos
        default :   pcN = (is_mem | is_alu) ? {pc_plus_2[`WIDTH-1:1], carry} : pc_plus_2; // store the new carry in bit 0
      endcase
    end
  end

  // Clock synchronous activities
  always @(negedge resetq or posedge clk)
  begin
    if (!resetq) begin
      reboot <= 1'b1;               // wait one clock before start
      { pc, dsp, tos, rsp } <= 0;   // and keep PC at 0
    end else begin
      if (reboot) begin
        reboot <= 1'b0;
        { pc, dsp, tos, rsp } <= 0; // and keep PC at 0
      end else begin
        // Trace of the processor state, printed once per executed opcode
        $display("\n PC: %h - %h - SP: %h / %h / %h - RP: %h / %h ", pc, insn, dsp, tos, nos, rsp, tor);
        { pc, dsp, tos, rsp } <= { pcN, dspN, tosN, rspN }; // new values
      end
    end
  end

endmodule // K1
